RMD_example 7.1: Read the Gene expression microarray data (samplexprs.csv)

# Get the file url
fileurl <- "http://ghuang.stat.nctu.edu.tw/course/statmethods16/files/data/samplexprs.csv"

# Import the csv file into R
#   note: stringsAsFactors=TRUE will screw up conversion to numeric!
samplexprs <- read.csv(fileurl, header=TRUE, stringsAsFactors=FALSE)

dim(samplexprs)
## [1]   78 4746
head(samplexprs[,1:10])
##     id age metastases followup ERp J00129 Contig29982_RC Contig42854
## 1 FG80  52          0     7.35 100 -0.795         -0.387       0.199
## 2 SF58  50          1     1.15   0 -0.509          0.459      -0.257
## 3 DE72  54          0    12.12 100 -0.961         -0.631       0.037
## 4 DE65  40          0     6.25   0 -0.749          0.699      -0.346
## 5 HG87  53          0     5.18   0 -0.426         -0.406      -0.355
## 6 HG88  37          1     1.09 100 -0.566         -0.596      -0.352
##   Contig42014_RC Contig27915_RC
## 1         -0.247          0.176
## 2         -0.065          0.129
## 3         -0.153          0.144
## 4          0.032          0.300
## 5          0.429         -0.036
## 6         -0.336          0.037

RMD_example 7.2: Stem-and-leaf plot

stem(samplexprs$age)
## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##   2 | 88
##   3 | 00234
##   3 | 677788889999
##   4 | 00111111112333444
##   4 | 55555666678888888999
##   5 | 0012222222333444444444
stem(samplexprs$J0012)
## 
##   The decimal point is at the |
## 
##   -2 | 0
##   -1 | 
##   -1 | 321000
##   -0 | 9999998888888888777777776666666666666665555555555555
##   -0 | 4444444433221
##    0 | 01
##    0 | 689
##    1 | 2

RMD_example 7.3: q-q plot

For q-q plot, we can first compute the percentiles for our list of data (ages/J0012) and for the normal distribution, and then compare them.

library(rafalib)

# plot for age
x <- samplexprs$age
ps <- ( seq(0,99) + 0.5 )/100
qs <- quantile(x, ps)
normalqs <- qnorm(ps, mean(x), popsd(x))
plot(normalqs,qs,xlab="Normal percentiles",ylab="Age percentiles")
abline(0,1, col="red") ##identity line

# plot for J0012
y <- samplexprs$J0012
ps <- ( seq(0,99) + 0.5 )/100
qs <- quantile(y, ps)
normalqs <- qnorm(ps, mean(y), popsd(y))
plot(normalqs,qs,xlab="Normal percentiles",ylab="J0012 percentiles")
abline(0,1, col="red") ##identity line

We can also see the plot with the existing function qqnorm.

qqnorm(x); qqline(x, col="red")

qqnorm(y); qqline(y, col="red")

Notice that the qqnorm function plots against a standard normal distribution. This is why the line has slope popsd(x) (or popsd(y)) and intercept mean(x) (or mean(y)).

In fact, we can run Monte Carlo simulations to see plots like this for data known to be normally distributed.

n <-1000
x <- rnorm(n)
qqnorm(x)
qqline(x, col="red")

We can also get a sense for how non-normally distributed data will look in a qq-plot. Here we generate data from the t-distribution with different degrees of freedom. Notice that the smaller the degrees of freedom, the fatter the tails. We call these “fat tails” because if we plotted an empirical density or histogram, the density at the extremes would be higher than the theoretical curve. In the qq-plot, this can be seen in that the curve is lower than the identity line on the left side and higher on the right side. This means that there are more extreme values than predicted by the theoretical density plotted on the x-axis.

dfs <- c(3,6,12,30)
mypar(2,2)
for(df in dfs){
  x <- rt(1000,df)
  qqnorm(x,ylab="t quantiles",main=paste0("d.f.=",df),ylim=c(-6,6))
  qqline(x, col="red")
}

RMD_example 7.4: Histogram

# Install package "ggplot2" first, then load it into R
library(ggplot2)

# with qplot
qplot(J00129, data=samplexprs, geom="histogram", binwidth=0.1)

qplot(J00129, data=samplexprs, geom="histogram", binwidth=0.05)

# with ggplot
h <- ggplot(samplexprs, aes(x=J00129))
h + geom_histogram(binwidth=0.05)

#    display a smooth density estimate
h + geom_histogram(aes(y = ..density..), binwidth=0.05) + geom_density()

RMD_example 7.5: Boxplot (=1)

qplot(x=factor(1), y=J00129, data=samplexprs, geom="boxplot", xlab="")

b <- ggplot(samplexprs, aes(x=factor(1), y=J00129))
b + geom_boxplot() + labs(x="")

b + geom_boxplot() + labs(x="") + coord_flip()

b + geom_boxplot(fill="grey80", colour="#3366FF") + labs(x="") + coord_flip()

RMD_example 7.6: Bar chart

ba <- ggplot(samplexprs, aes(x=factor(ERp)))
ba + geom_bar()

ba + geom_bar(fill="white", colour="darkgreen")

qplot(factor(ERp), data=samplexprs, geom="bar", fill=factor(ERp))

RMD_example 7.7: Pie chart

p <- ggplot(samplexprs, aes(x=factor(1), fill=factor(ERp)))
p + geom_bar(width=1, position="fill") + coord_polar(theta="y")

RMD_example 7.8: Scatterplot

s <- ggplot(samplexprs, aes(J00129, Contig29982_RC))
s + geom_point()

# Add trend line
s + geom_point() + stat_smooth()
## `geom_smooth()` using method = 'loess'

RMD_example 7.9: Read exprs_sig.csv

# Get the file url
fileurl <- "http://ghuang.stat.nctu.edu.tw/course/statmethods16/files/data/exprs_sig.csv"

# Import the csv file into R
#   note: stringsAsFactors=TRUE will screw up conversion to numeric!
exprs_sig <- read.csv(fileurl, header=TRUE, stringsAsFactors=FALSE)
#   note: read.csv treats the 1st column of exprs_sig.csv as the data, not the rownames,
#         we have to correct this
#         get the 1st column as the rownames
names1 <- exprs_sig[,1]
#         remove the 1st column from exprs_sig
exprs_sig <- exprs_sig[,-1]
#         assign the rownames
rownames(exprs_sig) <- names1
colnames(exprs_sig)
##  [1] "FG80" "SF58" "DE72" "DE65" "HG87" "HG88" "AB22" "HG91" "HG92" "KH11"
## [11] "KH20" "SF67" "LD44" "AA04" "AA01" "GL73" "AA10" "HG86" "DE62" "AB26"
## [21] "SF57" "DE61" "DD41" "AB31" "LD30" "SF66" "GL75" "AA05" "LD37" "FG78"
## [31] "DD42" "DD44" "KH99" "SF60" "DE66" "FG79" "FG83" "DD50" "DD46" "LD56"
## [41] "AA15" "AA02" "FG77" "HG93" "GL77" "HG90" "DD40" "GL71" "DD49" "LD33"
## [51] "KH15" "FG81" "DE71" "FG82" "DE73" "HG85" "LD45" "KH22" "LD35" "AA16"
## [61] "AA12" "DE63" "LD39" "DE70" "AB32" "AB20" "HG89" "SF69" "LD27" "AB24"
## [71] "AA03" "GL84" "DD43" "DD52" "FG75" "GL72" "KH14" "KH17"
dim(exprs_sig)
## [1] 4741   78
head(exprs_sig[,1:10])
##                  FG80   SF58   DE72   DE65   HG87   HG88   AB22   HG91
## J00129         -0.795 -0.509 -0.961 -0.749 -0.426 -0.566 -0.420 -0.499
## Contig29982_RC -0.387  0.459 -0.631  0.699 -0.406 -0.596 -0.286 -0.402
## Contig42854     0.199 -0.257  0.037 -0.346 -0.355 -0.352 -0.090  0.181
## Contig42014_RC -0.247 -0.065 -0.153  0.032  0.429 -0.336 -0.048  0.143
## Contig27915_RC  0.176  0.129  0.144  0.300 -0.036  0.037  0.291 -0.268
## Contig20156_RC -0.129  0.009 -0.202 -0.025  0.191 -0.147 -0.166  0.849
##                  HG92   KH11
## J00129         -0.465 -0.189
## Contig29982_RC -0.533 -0.309
## Contig42854    -0.019 -0.152
## Contig42014_RC  0.019  0.918
## Contig27915_RC -0.388  0.585
## Contig20156_RC -0.012 -0.144

RMD_example 7.10: Scatterplot for big data

sh <- ggplot(exprs_sig, aes(AA01, SF58))
sh + geom_point()

# Varying alpha is useful for large datasets
sh + geom_point(alpha = 0.3)

# Add trend line
sh + geom_point(alpha = 0.3) + stat_smooth()
## `geom_smooth()` using method = 'gam'

# Or adding heatmap of 2d bin
sh + geom_bin2d()

RMD_example 7.11: Boxplot (>1)

# With qplot
qplot(x=factor(metastases), y=J00129, data=samplexprs, geom="boxplot")

qplot(x=factor(ERp), y=J00129, data=samplexprs, geom="boxplot")

# With ggplot
b2 <- ggplot(samplexprs, aes(x=factor(metastases), y=J00129))
b2 + geom_boxplot()

b2 <- ggplot(samplexprs, aes(x=factor(ERp), y=J00129))
b2 + geom_boxplot()

RMD_example 7.12: Stacked bar chart

sba <- ggplot(samplexprs, aes(x=factor(ERp), fill=factor(metastases)))
# y is count
sba + geom_bar()

sba + geom_bar() + coord_flip()

# y is proportion
sba + geom_bar(position="fill") + labs(y="proportion")

RMD_example 7.13: Faceting bar charts

fba <- ggplot(samplexprs, aes(x=factor(ERp)))
fba + geom_bar() + facet_wrap(~metastases)

RMD_example 7.14: Stacked area chart

# For ERp+metastases
#   Create 2x2 table for ERp vs. metastases and obtain counts
a<-xtabs(~ERp+metastases, data=samplexprs)
b<-data.frame(b1=c(a), b2=rep(as.numeric(colnames(a)), rep(dim(a)[1],dim(a)[2])), b3=rep(as.numeric(rownames(a)),dim(a)[2]))
#   Stacked area chart
sa <- ggplot(b, aes(x=b3, y=b1, fill=factor(b2)))
sa + geom_area() + labs(x="ERp", y="count", fill="metastases")

sa + geom_area(position="fill") + labs(x="ERp", y="proportion", fill="metastases")

# For age+metastases
#   Create 2x2 table for age vs. metastases and obtain counts
a<-xtabs(~age+metastases, data=samplexprs)
b<-data.frame(b1=c(a), b2=rep(as.numeric(colnames(a)), rep(dim(a)[1],dim(a)[2])), b3=rep(as.numeric(rownames(a)),dim(a)[2]))
#   Stacked area chart
sa <- ggplot(b, aes(x=b3, y=b1, fill=factor(b2)))
sa + geom_area() + labs(x="age", y="count", fill="metastases")

sa + geom_area(position="fill") + labs(x="age", y="proportion", fill="metastases")

RMD_example 7.15: Time series plot

# Connect observations, ordered by x value
t <- ggplot(samplexprs, aes(x=age, y=J00129, group=metastases))
t + geom_line(aes(colour=factor(metastases)), size=1)

RMD_example 7.16: 3d scatterplot

# Install the package "lattice", then load it into R
library(lattice)

# 3d scatterplot
cloud(age~J00129+Contig29982_RC, data=samplexprs)

cloud(AA01~SF58+LD37, data=exprs_sig)

#    lattice in the 3rd dim
xyplot(Contig29982_RC~J00129 | factor(metastases), data=samplexprs)

xyplot(Contig29982_RC~J00129 | factor(ERp), data=samplexprs)

#    map the 3rd dim to colors
d3c <- ggplot(samplexprs, aes(J00129, Contig29982_RC))
d3c + geom_point(aes(colour=age))

d3c + geom_point(aes(colour=factor(metastases)))

d3c + geom_point(aes(colour=factor(ERp)))

#    lay out panels in the 3rd dim
d3c + geom_point() + facet_grid(.~metastases)

d3c + geom_point() + facet_grid(.~ERp)

RMD_example 7.17: Scatterplot matrices

pairs(samplexprs[,c('age','ERp','J00129','Contig29982_RC')])

pairs(samplexprs[,c('age','ERp','J00129','Contig29982_RC')], col=as.integer(samplexprs[,'metastases'])+1)

pairs(samplexprs[,c('age','ERp','J00129','Contig29982_RC')], col=as.integer(samplexprs[,'metastases'])+1,
      panel=panel.smooth)

RMD_example 7.18: Heatmap

# Heatmap
a <- exprs_sig[sample(1:dim(exprs_sig)[1], size=70),]
#   no dendrogram
heatmap(as.matrix(a), Rowv=NA, Colv=NA,
        xlab="patients", ylab="genes",
        cexRow=0.5, cexCol=0.6)

#   with dendrogram
heatmap(as.matrix(a),
        xlab="patients", ylab="genes",
        cexRow=0.5, cexCol=0.6)

#   with different color schemes
#     install the package "RColorBrewer", then load it into R
library(RColorBrewer)

heatmap(as.matrix(a), col=brewer.pal(9, "BuGn"),
        xlab="patients", ylab="genes",
        cexRow=0.5, cexCol=0.6)

#   use heatmap.2 and self-set dendrogram
#     install packages "graphics", "gplots", "RColorBrewer"", then load them into R
library(graphics)
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(RColorBrewer)

a <- exprs_sig[sample(1:dim(exprs_sig)[1], size=70),]
hcpat <- hclust(dist(t(a)), method="average")
dendpat <- as.dendrogram(hcpat)
hcgene <- hclust(as.dist(1-cor(t(a))), method="average")
dendgene <- as.dendrogram(hcgene)

#       create a blue -> purple colour palette
jBuPuFun <- colorRampPalette(brewer.pal(n = 9, "BuPu"))
paletteSize <- 256
jBuPuPalette <- jBuPuFun(paletteSize)

heatmap.2(as.matrix(a), Rowv=dendgene, Colv=dendpat, col=jBuPuPalette,
          xlab="patients", ylab="genes", scale="none", trace="none",
          cexRow=0.5, cexCol=0.6, density.info="none")